For the reader: Click the black “code” box on the right hand side to see or hide the R code behind the analysis results and graphs. The results are arranged to separate tabs by site/area so that it’s easy to look at just the results of the site of interest, instead of scrolling accidentally down to a different site and getting confused.
The results for each site consist of four parts: pXRF, particle size distribution, loss on ignition, and colorimetry.
From literature we can identify three rough classes of elements that could be used to analyze mudbricks from pXRF results:
Good: Rb, Sr, Y, Zr, Nb, Th, Pb, Cu, Zn, Fe
Neutral: V, Cr, Co, Ni, Ca, Ti
Bad: Na, P, Ba (+ Ca, Al)
The “bad” elements were excluded from the pXRF results from the get-go. After scaling and averaging the remaining values we ended up with two analysis datasets: “pxrf_all” that contains all the elements that have non-zero results, and “pxrf_no_na”, in which all the elements that have even one NA result are omitted.
We end up with 20 elements in the “pxrf_all” dataset, and 12 elements in the “pxrf_no_na” dataset.
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Read the raw data
pxrf <- read.xlsx("data/pxrf/Raw_AY.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
values <- c("MgO", "Al2O3", "SiO2",
"S", "Cl", "K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
pxrf_no_na:
Good: Fe, Cu, Zn, Rb, Sr, Zr
Neutral: Ti, CaO
Inconclusive: Al2O2, SiO2, K2O, Mn
glimpse(pxrf_no_na)
## Rows: 47
## Columns: 14
## $ Area <fct> Area D Wall, Area D Wall, Area D Wall, Area D Wall, Area D Wall,~
## $ Type <fct> mud mortar, mud plaster, mud plaster, mud plaster, mud plaster, ~
## $ Al2O3 <dbl> -0.83430964, 0.28804627, 0.92438017, -0.75653070, 0.83252855, 0.~
## $ SiO2 <dbl> -1.07372922, 0.46591062, 0.91277017, -0.41768748, 1.37707345, 0.~
## $ K2O <dbl> -0.9354224, -0.1969237, 0.4729661, -0.8000151, 0.3529005, 0.9275~
## $ CaO <dbl> -0.31855193, -0.76710354, 1.55087898, -0.52286381, -0.30804896, ~
## $ Ti <dbl> -0.87635022, -0.13169866, 0.36204744, -0.56350248, 0.12413530, 0~
## $ Mn <dbl> -0.47857024, -0.36084140, 0.40974742, -0.26251105, 0.07529047, 0~
## $ Fe <dbl> -0.65915484, -0.30494130, 0.64604506, -0.48601854, 0.28383043, 1~
## $ Cu <dbl> -0.0382265, -0.9436700, -0.2645874, -0.3777678, -1.0568504, 0.30~
## $ Zn <dbl> -0.01720942, -0.65101964, -0.39749555, 0.10955263, -0.39749555, ~
## $ Rb <dbl> 0.36277317, -0.54198093, -0.29522982, 0.67121207, -0.54198093, 0~
## $ Sr <dbl> -0.11489739, -0.84951611, 0.60409114, -0.36107281, -0.41187092, ~
## $ Zr <dbl> -1.764549e-01, -7.787123e-01, 6.876654e-05, 1.298656e-01, -3.322~
pxrf_all:
Good: Fe, Cu, Zn, Rb, Sr, Y, Zr, Nb
Neutral: Ni, CaO, Ti, V
Inconclusive: As, MgO, Al2O2, SiO2, S, Cl, K2O, Mn
glimpse(pxrf_all)
## Rows: 47
## Columns: 22
## $ Area <fct> Area D Wall, Area D Wall, Area D Wall, Area D Wall, Area D Wall,~
## $ Type <fct> mud mortar, mud plaster, mud plaster, mud plaster, mud plaster, ~
## $ MgO <dbl> -0.51417215, 0.64100358, 0.22606806, -0.12125145, 0.61130727, -0~
## $ Al2O3 <dbl> -0.83430964, 0.28804627, 0.92438017, -0.75653070, 0.83252855, 0.~
## $ SiO2 <dbl> -1.07372922, 0.46591062, 0.91277017, -0.41768748, 1.37707345, 0.~
## $ S <dbl> -0.2323566, -0.2339645, NA, NA, -0.1447233, NA, NA, NA, NA, NA, ~
## $ Cl <dbl> -0.45906368, 0.66024144, 0.47946871, -0.29005356, 0.12535227, -0~
## $ K2O <dbl> -0.9354224, -0.1969237, 0.4729661, -0.8000151, 0.3529005, 0.9275~
## $ CaO <dbl> -0.31855193, -0.76710354, 1.55087898, -0.52286381, -0.30804896, ~
## $ Ti <dbl> -0.87635022, -0.13169866, 0.36204744, -0.56350248, 0.12413530, 0~
## $ V <dbl> NA, NA, -0.009516182, NA, NA, 0.216968943, NA, NA, NA, 0.6861167~
## $ Mn <dbl> -0.47857024, -0.36084140, 0.40974742, -0.26251105, 0.07529047, 0~
## $ Fe <dbl> -0.65915484, -0.30494130, 0.64604506, -0.48601854, 0.28383043, 1~
## $ Ni <dbl> 1.17211765, -0.73894374, -0.73894374, 0.14833477, -0.92094959, 0~
## $ Cu <dbl> -0.0382265, -0.9436700, -0.2645874, -0.3777678, -1.0568504, 0.30~
## $ Zn <dbl> -0.01720942, -0.65101964, -0.39749555, 0.10955263, -0.39749555, ~
## $ As <dbl> NA, NA, -0.0963078, NA, -0.8953058, -0.2960573, NA, 2.3006863, N~
## $ Rb <dbl> 0.36277317, -0.54198093, -0.29522982, 0.67121207, -0.54198093, 0~
## $ Sr <dbl> -0.11489739, -0.84951611, 0.60409114, -0.36107281, -0.41187092, ~
## $ Y <dbl> NA, -0.85601627, -0.05383750, 0.28040365, -0.41036140, 0.3026864~
## $ Zr <dbl> -1.764549e-01, -7.787123e-01, 6.876654e-05, 1.298656e-01, -3.322~
## $ Nb <dbl> NA, NA, -0.4829764, NA, -0.8931231, NA, -0.4829764, NA, 0.952536~
Introduction
# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)
# Visualize correlations between elements
cor(pxrf_1[3:12]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:12], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements
# PCA
pca_1 <- prcomp(pxrf_1[3:12])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.9037 1.0026 0.8270 0.67213 0.37446 0.35729 0.29419
## Proportion of Variance 0.5809 0.1611 0.1096 0.07242 0.02248 0.02046 0.01387
## Cumulative Proportion 0.5809 0.7421 0.8517 0.92413 0.94661 0.96707 0.98095
## PC8 PC9 PC10
## Standard deviation 0.26346 0.16502 0.14906
## Proportion of Variance 0.01113 0.00437 0.00356
## Cumulative Proportion 0.99207 0.99644 1.00000
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA Ashdod-Yam elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam grouped by area")
autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam grouped by sample type")
PCA with all the feasible elements
# PCA with NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
pca_2 <- prcomp(pxrf_all[3:22])
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.1392 1.2623 1.1731 0.77694 0.75938 0.56778 0.50254
## Proportion of Variance 0.4455 0.1551 0.1340 0.05877 0.05614 0.03138 0.02459
## Cumulative Proportion 0.4455 0.6006 0.7346 0.79337 0.84951 0.88090 0.90548
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.44075 0.42941 0.3738 0.33174 0.29859 0.26048 0.24174
## Proportion of Variance 0.01891 0.01795 0.0136 0.01071 0.00868 0.00661 0.00569
## Cumulative Proportion 0.92439 0.94235 0.9559 0.96666 0.97534 0.98195 0.98764
## PC15 PC16 PC17 PC18 PC19 PC20
## Standard deviation 0.22407 0.16224 0.16055 0.12292 0.08709 0.04448
## Proportion of Variance 0.00489 0.00256 0.00251 0.00147 0.00074 0.00019
## Cumulative Proportion 0.99253 0.99509 0.99760 0.99907 0.99981 1.00000
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA Ashdod-Yam more elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam more elements grouped by area")
autoplot(pca_1, data=pxrf_1, colour='Type', shape = FALSE, label = TRUE, main = "PCA Ashdod-Yam more elements grouped by sample type")
Introduction
# HCA
# compute divisive hierarchical clustering
hc4 <- diana(pxrf_1[3:12])
# Divise coefficient; amount of clustering structure found
hc4$dc
## [1] 0.8007861
# Diana dendrogram
pltree(hc4, cex = 1, hang = -1, main = "Dendrogram of diana")
# HCA dendrogam 1
dend <-
pxrf_1[3:12] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=5) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=5) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward5 <- hclust(dist(pxrf_1[3:12], method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=5)
Introduction
## Particle size
# Read and filter data
particle <- read.xlsx("data/granulometry/AY_grain.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "AY_grain2"
write.xlsx(particle, file="data/granulometry/AY_grain2.xlsx")
# Manually added Type and Context columns to AY_grain2 data: "AY_grain3"
grain3 <- read.xlsx("data/granulometry/AY_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=2)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
Introduction
Introduction
Introduction
Introduction
## Particle size
# Read and filter data
particle <- read.xlsx("data/granulometry/AY_grain.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "AY_grain2"
write.xlsx(particle, file="data/granulometry/AY_grain2.xlsx")
# Manually added Type and Context columns to AY_grain2 data: "AY_grain3"
grain3 <- read.xlsx("data/granulometry/AY_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
Introduction
Introduction
Introduction
# Libraries
library(openxlsx);
# Read data
nn <- read.xlsx("data/pxrf/Raw_Israel.xlsx", sep=";")
nn <- as.data.frame(nn)
Introduction
## Particle size
# Read and filter data
particle <- read.xlsx("data/granulometry/israel_grain.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "israel_grain2"
write.xlsx(particle, file="data/granulometry/israel_grain2.xlsx")
# Manually added Type and Context columns to israel_grain2 data: "israel_grain3"
grain3 <- read.xlsx("data/granulometry/israel_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Type)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
Introduction
Introduction
Introduction
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggfortify);
library(pca3d);
library(ggdendro) # dendrograms together with ggplot
library(dendextend) # dendrograms - change parameters
library(tidyverse) # rectangles around HClust
# Read data
nn <- read.xlsx("data/pxrf/PP-z2-pXRF.xlsx", sep=";")
nn <- as.data.frame(nn)
# Drop unuseable elements
# nn <- nn %>% select(-Al2O3)
# nn <- nn %>% select(-SiO2)
# nn <- nn %>% select(-P2O5)
# nn <- nn %>% select(-CaO)
# For correlation and PCA "Samples" must be rownames instead of factors
rownames(nn) <- nn$Sample
n1 <- nn %>% select(-Sample)
n2 <- n1 %>% select(-Area)
n2 <- n2 %>% select(-Type)
# Area as factor, cannot be used in PCA (select your columns when using n1)
n1$Area <- factor(n1$Area)
n1$Type <- factor(n1$Type)
## ALL SAMPLES, CONTAINING SOIL
# Visualize correlations between elements
cor(n2) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")
# Optimal number of clusters in K-means analysis
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(n2, k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# Clustering in ggpairs
km <-kmeans(n2, centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(n2, mapping = aes(col = cluster_x))
# PCA
pca_n2 <- prcomp(n2)
summary(pca_n2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.6328 1.0883 0.8191 0.62862 0.35886 0.28046
## Proportion of Variance 0.5203 0.2312 0.1309 0.07712 0.02513 0.01535
## Cumulative Proportion 0.5203 0.7514 0.8824 0.95952 0.98465 1.00000
biplot(pca_n2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
#### ONLY MUDBRICKS
# Subset n3 containing only the mudbricks
rownames(n2)
## [1] "PP-10" "PP-11" "PP-12" "PP-13" "PP-14" "PP-15" "PP-16" "PP-17" "PP-18"
## [10] "PP-19" "PP-9" "P1" "P10" "P11" "P12" "P13" "P14" "P15"
## [19] "P16" "P17" "P18" "P19" "P2" "P20" "P21" "P22" "P23"
## [28] "P24" "P25" "P26" "P27" "P28" "P29" "P3" "P30" "P31"
## [37] "P32" "P33" "P34" "P4" "P5" "P6" "P7" "P8" "P9"
## [46] "PP-1" "PP-2" "PP-3" "PP-4" "PP-5" "PP-6" "PP-7" "PP-8"
n3 <- n2[-c(46:53), ]
rownames(n3)
## [1] "PP-10" "PP-11" "PP-12" "PP-13" "PP-14" "PP-15" "PP-16" "PP-17" "PP-18"
## [10] "PP-19" "PP-9" "P1" "P10" "P11" "P12" "P13" "P14" "P15"
## [19] "P16" "P17" "P18" "P19" "P2" "P20" "P21" "P22" "P23"
## [28] "P24" "P25" "P26" "P27" "P28" "P29" "P3" "P30" "P31"
## [37] "P32" "P33" "P34" "P4" "P5" "P6" "P7" "P8" "P9"
# Calculate the total within sum of squares to find the optimal number of clusters
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(n3, k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# PCA
pca_n3 <- prcomp(n3)
summary(pca_n3)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.6735 1.1429 0.8461 0.59258 0.36664 0.25433
## Proportion of Variance 0.5212 0.2431 0.1332 0.06535 0.02502 0.01204
## Cumulative Proportion 0.5212 0.7644 0.8976 0.96294 0.98796 1.00000
biplot(pca_n3, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"))
#Visualize PC:s
plot(pca_n3, type="lines")
# 3D PCA plots
# HCA (only mudbricks)
dend <-
n3 %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=3) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=3) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
###### NEW PLOTS WITH CLASS INFO #######
# PCA
pca_01 <- prcomp(n1[3:8])
summary(pca_01)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.6328 1.0883 0.8191 0.62862 0.35886 0.28046
## Proportion of Variance 0.5203 0.2312 0.1309 0.07712 0.02513 0.01535
## Cumulative Proportion 0.5203 0.7514 0.8824 0.95952 0.98465 1.00000
n4 <- n1 %>% select(-Ni)
pca_02 <- prcomp(n4[3:7])
summary(pca_02)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.6222 0.8392 0.6292 0.38007 0.28635
## Proportion of Variance 0.6649 0.1779 0.1000 0.03649 0.02072
## Cumulative Proportion 0.6649 0.8428 0.9428 0.97928 1.00000
# PCA
biplot(pca_01, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA All elements")
autoplot(pca_01, data=n1, colour='Area', shape = FALSE, label = TRUE, main = "PCA All elements")
biplot(pca_02, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "No nickel")
autoplot(pca_02, data=n4, colour='Area', shape = FALSE, label = TRUE, main = "No nickel")
# only new MB samples
n5 <- n1 %>% select(-Ni)
n5 <- n5[-c(12:53), ]
pca_03 <- prcomp(n5[3:7])
summary(pca_03)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.5880 0.7415 0.7090 0.17542 0.07632
## Proportion of Variance 0.6984 0.1522 0.1392 0.00852 0.00161
## Cumulative Proportion 0.6984 0.8506 0.9899 0.99839 1.00000
# HCA
dend <-
n5 %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=3) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=3) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
biplot(pca_03, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "Only new MB samples")
autoplot(pca_03, data=n5, colour='Area', shape = FALSE, label = TRUE, main = "Only new MB samples")
# Wards method (HCA) method 2:
HCA.ward5 <- hclust(dist(n5, method="euclid"), method="ward.D2")
plot(HCA.ward5, main="Ward's method")
rect.hclust(HCA.ward5, k=3)
Introduction
## Particle size
# Read and filter data
particle <- read.xlsx("data/granulometry/PP_grain.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "PP_grain2"
write.xlsx(particle, file="data/granulometry/PP_grain2.xlsx")
# Manually added Type and Context columns to PP_grain2 data: "PP_grain3"
grain3 <- read.xlsx("data/granulometry/PP_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
MB_grain <- grain[c(9:19),]
MB_context <- grain3[c(9:19),]
ggtern(data=MB_grain, aes(x=Clay, y=Sand, z=Silt, color=MB_context$Locus)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(MB_grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
Introduction
Introduction
Introduction
# Libraries
library(dplyr);
library(GGally);
library(corrplot);
library(tidyr);
library(openxlsx);
library(cluster);
library(ggplot2);
library(ggfortify);
library(pca3d);
library(ggdendro); # dendrograms together with ggplot
library(dendextend); # dendrograms - change parameters
library(ggtern) # ternary plots
# Read the raw data
pxrf <- read.xlsx("data/pxrf/Raw_AA.xlsx", sep=";")
# Remove any rows containing words 'ERROR' or 'TEST'
pxrf <- filter(pxrf, !grepl('TEST', Sample))
pxrf <- filter(pxrf, !grepl('ERROR', Sample))
# Turn any "LOD":s and missing values to "NA":s to allow scaling
# Removing "Na", S", "Cl", "P" and "Ba" already
values <- c("MgO", "Al2O3", "SiO2",
"K2O", "CaO", "Ti", "V", "Cr",
"Mn", "Fe", "Co", "Ni", "Cu", "Zn", "As",
"Se", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
"Rh", "Pd", "Ag", "Cd", "Sn", "Sb",
"La", "Ce", "Hf", "Ta", "W", "Pt", "Au",
"Hg", "Tl", "Pb", "Bi", "Th", "U")
pxrf_NA <- select(pxrf, one_of(values))
pxrf_NA <- pxrf_NA %>% mutate_if(is.character,as.numeric)
# Scaling the data with standard z-score method
scaled_pxrf <- as.data.frame(scale(pxrf_NA))
# Average data by "Sample", "Area" and "Type" columns from the original dataset
average_pxrf <- aggregate(scaled_pxrf, by = list(pxrf$Sample, pxrf$Area, pxrf$Type), FUN = mean)
# Assign sample names as row names, and rename the other newly created columns back to "Type" and "Area" for clarity
rownames(average_pxrf) <- average_pxrf$Group.1
average_pxrf <- average_pxrf %>% select(-Group.1)
names(average_pxrf)[names(average_pxrf) == "Group.2"] <- "Area"
names(average_pxrf)[names(average_pxrf) == "Group.3"] <- "Type"
# Change "Type" and "Area" to factors
average_pxrf$Type <- factor(average_pxrf$Type)
average_pxrf$Area <- factor(average_pxrf$Area)
# Removing columns that only have NA values
all_na <- function(x) any(!is.na(x))
pxrf_all <- average_pxrf %>% select_if(all_na)
# Removing columns if they contain any NA values at all
any_na <- function(x) all(!is.na(x))
pxrf_no_na <- average_pxrf %>% select_if(any_na)
pxrf_no_na:
glimpse(pxrf_no_na)
## Rows: 10
## Columns: 17
## $ Area <fct> Hill II, Hill II, Hill II, Phase I, Phase I, Phase II, Phase II,~
## $ Type <fct> mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudb~
## $ MgO <dbl> -0.1594010, 0.1129059, -0.4510038, -0.3887203, 0.4369466, 1.9131~
## $ Al2O3 <dbl> 0.6990613, 0.2737338, -0.1611149, -0.2592941, 0.4246452, 1.38943~
## $ SiO2 <dbl> 0.85557445, 0.61604918, -0.12790064, 0.09683497, 0.61414855, 0.7~
## $ K2O <dbl> 0.60692024, 0.02816062, -0.75789490, 0.04880237, 1.11418034, 1.0~
## $ CaO <dbl> -0.54131017, 1.20914863, 0.05989818, 0.23434386, 1.02571929, 0.9~
## $ Ti <dbl> 0.8351083, 0.8389159, -0.2277469, -0.1097123, 0.8688325, 0.73502~
## $ Mn <dbl> -0.16154103, 0.23307430, -0.07138547, -0.11572427, 0.55674754, 1~
## $ Fe <dbl> 0.05533421, 0.13332917, -0.15388527, -0.15233743, 0.09919632, 1.~
## $ Ni <dbl> 0.82749061, -0.04503350, -0.80497386, -0.94570355, -0.24205508, ~
## $ Cu <dbl> 8.205941e-01, 2.188251e-01, -1.039419e+00, 4.811057e-16, 4.37650~
## $ Zn <dbl> 0.41186175, -0.19830381, -0.88474007, -0.02033885, 0.33559106, 1~
## $ Rb <dbl> 1.13127170, -0.15127470, -0.51301856, -1.20362047, -0.01973148, ~
## $ Sr <dbl> -1.40588371, -0.28024838, 0.07368850, 0.63070392, 1.43141359, -0~
## $ Y <dbl> 0.57088572, 0.04228783, -0.22201111, -0.27487090, -0.01057196, 1~
## $ Zr <dbl> -0.4713101, -1.1158368, -0.5317345, -0.2094712, 1.3011383, 1.905~
pxrf_all:
glimpse(pxrf_all)
## Rows: 10
## Columns: 21
## $ Area <fct> Hill II, Hill II, Hill II, Phase I, Phase I, Phase II, Phase II,~
## $ Type <fct> mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudbrick, mudb~
## $ MgO <dbl> -0.1594010, 0.1129059, -0.4510038, -0.3887203, 0.4369466, 1.9131~
## $ Al2O3 <dbl> 0.6990613, 0.2737338, -0.1611149, -0.2592941, 0.4246452, 1.38943~
## $ SiO2 <dbl> 0.85557445, 0.61604918, -0.12790064, 0.09683497, 0.61414855, 0.7~
## $ K2O <dbl> 0.60692024, 0.02816062, -0.75789490, 0.04880237, 1.11418034, 1.0~
## $ CaO <dbl> -0.54131017, 1.20914863, 0.05989818, 0.23434386, 1.02571929, 0.9~
## $ Ti <dbl> 0.8351083, 0.8389159, -0.2277469, -0.1097123, 0.8688325, 0.73502~
## $ V <dbl> 0.02741905, 0.56361382, NA, NA, -0.43565825, -0.19193335, -0.460~
## $ Cr <dbl> NA, NA, NA, NA, -0.6861787, 0.2217828, NA, NA, NA, NA
## $ Mn <dbl> -0.16154103, 0.23307430, -0.07138547, -0.11572427, 0.55674754, 1~
## $ Fe <dbl> 0.05533421, 0.13332917, -0.15388527, -0.15233743, 0.09919632, 1.~
## $ Ni <dbl> 0.82749061, -0.04503350, -0.80497386, -0.94570355, -0.24205508, ~
## $ Cu <dbl> 8.205941e-01, 2.188251e-01, -1.039419e+00, 4.811057e-16, 4.37650~
## $ Zn <dbl> 0.41186175, -0.19830381, -0.88474007, -0.02033885, 0.33559106, 1~
## $ As <dbl> 1.10179138, -0.87461790, NA, -0.05111403, 0.27828751, 0.44298829~
## $ Rb <dbl> 1.13127170, -0.15127470, -0.51301856, -1.20362047, -0.01973148, ~
## $ Sr <dbl> -1.40588371, -0.28024838, 0.07368850, 0.63070392, 1.43141359, -0~
## $ Y <dbl> 0.57088572, 0.04228783, -0.22201111, -0.27487090, -0.01057196, 1~
## $ Zr <dbl> -0.4713101, -1.1158368, -0.5317345, -0.2094712, 1.3011383, 1.905~
## $ Nb <dbl> 0.93500042, -0.65796326, NA, -0.06060188, -0.06060188, -0.657963~
Introduction
# Drop the more dubious elements
pxrf_1 <- pxrf_no_na %>% select(-Al2O3)
pxrf_1 <- pxrf_1 %>% select(-CaO)
# Visualize correlations between elements
cor(pxrf_1[3:15]) %>% corrplot (type = "lower", tl.cex = 1, tl.pos="d", cl.pos="r")
# Optimal number of clusters in K-means analysis: 2
k_max <- 8
twcss <- sapply(1:k_max, function(k){kmeans(pxrf_1[3:12], k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
# K-means clustering in ggpairs: 2 clusters
km <-kmeans(pxrf_1[3:12], centers = 2)
cluster_x <- as.factor(km$cluster)
ggpairs(pxrf_1, mapping = aes(col = cluster_x))
PCA with only the no-NA:s elements
# PCA
pca_1 <- prcomp(pxrf_1[3:15])
summary(pca_1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4147 1.0889 0.9306 0.48309 0.37987 0.31662 0.24466
## Proportion of Variance 0.6901 0.1403 0.1025 0.02762 0.01708 0.01186 0.00708
## Cumulative Proportion 0.6901 0.8305 0.9330 0.96059 0.97767 0.98953 0.99662
## PC8 PC9 PC10
## Standard deviation 0.15630 0.06455 7.266e-17
## Proportion of Variance 0.00289 0.00049 0.000e+00
## Cumulative Proportion 0.99951 1.00000 1.000e+00
# PCA1 plots
biplot(pca_1, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA limited elements")
autoplot(pca_1, data=pxrf_1, colour='Area', shape = FALSE, label = TRUE, main = "PCA grouped by area")
PCA with all the feasible elements
# PCA with all elements, NA:s converted to zeros
pxrf_all[is.na(pxrf_all)] <- 0
pca_2 <- prcomp(pxrf_all[3:21])
summary(pca_2)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.6421 1.2026 0.98838 0.84185 0.45012 0.39002 0.30457
## Proportion of Variance 0.6529 0.1353 0.09136 0.06628 0.01895 0.01423 0.00868
## Cumulative Proportion 0.6529 0.7881 0.87946 0.94574 0.96469 0.97891 0.98759
## PC8 PC9 PC10
## Standard deviation 0.28180 0.23084 2.238e-16
## Proportion of Variance 0.00743 0.00498 0.000e+00
## Cumulative Proportion 0.99502 1.00000 1.000e+00
# PCA2 plots
biplot(pca_2, choices = 1:2, cex = c(1, 1.2), col = c("grey80", "deeppink2"), main = "PCA more elements")
autoplot(pca_2, data=pxrf_all, colour='Area', shape = FALSE, label = TRUE, main = "PCA more elements grouped by area")
Introduction
# HCA
# HCA dendrogam 1
dend <-
pxrf_1[3:12] %>% # data
dist %>% # calculate a distance matrix
hclust(method = "ward.D2") %>% # hierarchical clustering
as.dendrogram %>% # turn the object into a dendrogram
set("branches_k_color", k=4) %>%
set("branches_lwd", 1.2) %>%
set("labels_col", c("red", "black", "blue"), k=4) %>%
set("labels_cex", 1) %>%
set("leaves_pch", NA)
# plot the dend in usual "base" plotting engine:
plot(dend)
# HCA dendrogam 2:
HCA.ward4 <- hclust(dist(pxrf_1[3:12], method="euclid"), method="ward.D2")
plot(HCA.ward4, main="HCA with clusters")
rect.hclust(HCA.ward4, k=4)
Introduction
## Particle size
# Read and filter data
particle <- read.xlsx("data/granulometry/AA_grain.xlsx", sep=";")
# Remove automatically created averages in order to include results from multiple runs for the same sample
particle <- particle %>%
filter(!grepl('Average', Sample)) %>%
filter(!grepl('test', Sample))
# Average by sample name
particle <- aggregate(particle, by=list(particle$Sample), FUN=mean)
particle <- particle %>% select(-Sample)
names(particle)[names(particle) == "Group.1"] <- "Sample"
# Saving the clean data as "AY_grain2"
write.xlsx(particle, file="data/granulometry/AA_grain2.xlsx")
# Manually added Type and Context columns to AA_grain2 data: "AA_grain3"
grain3 <- read.xlsx("data/granulometry/AA_grain3.xlsx", sep=";")
# Samples as rownames
rownames(grain3) <- grain3$Sample
# Scaling clay-silt-sand portions to values between 0 and 1
## Otherwise the differences in low clay percentages are lost
normalize <- function(x, na.rm=TRUE){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
grain <- as.data.frame(apply(grain3[3:5], 2, normalize))
# Ternary plots
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_rgbw() +
theme_nomask() +
geom_point(size=0) +
geom_text(aes(label=rownames(grain)), size=3)
ggtern(data=grain, aes(x=Clay, y=Sand, z=Silt, color=grain3$Context)) +
labs(title="Clay silt sand") +
theme_nomask() +
geom_mask() +
geom_point(size=2)
Introduction
Introduction